# Alexander F. Gazmararian
# agazmararian@gmail.com

library(here)
library(tidyverse)
library(tidylog)
library(janitor)
library(stdidx)
library(glue)

# Replication mode detection ----
# The anonymized package relies on a pre-cached survey file that already
# carries FIPS codes. Re-processing would lose the geographic identifiers
# needed for the merge, so that mode only verifies the cache and skips the
# cleaning steps in the `else` branch.
source(here("R", "visibility", "replication_mode.R"))
REPLICATION_MODE <- init_replication_mode()

cache_file <- here("data", "cache", "survey_visibility_processed.rds")

if (REPLICATION_MODE == "anonymized") {
  # Fail fast: without the cache there is nothing to fall back on in this mode.
  if (!file.exists(cache_file)) {
    stop("Anonymized mode requires cached survey data. File not found: ", cache_file, "\n",
         "The anonymized replication package requires pre-processed survey data.\n",
         "This cache should be generated by running the pipeline in full mode first.")
  }
  message("=== ANONYMIZED MODE ===")
  message("Using cached survey data with geographic identifiers")
  message("Skipping re-processing to preserve FIPS codes needed for merge")
  message(glue("[OK] Cached survey data available at: {cache_file}"))
} else {

# Raw Qualtrics exports ----
# Every export loses its first two data rows (presumably the extra Qualtrics
# header rows -- confirm against the CSVs) and is tagged with its survey wave.

# Spring 2024 climate finance survey: has credit questions
g1 <- read.csv(
  here(
    "data", "input", "qualtrics_spring_2024",
    "Climate Finance - Spring 2024 - US_April 11, 2024_07.58.csv"
  )
)
g1 <- g1[-seq_len(2), ]
g1[["sample"]] <- "Spring2024Finance"

# Summer 2024 CBAM survey: has credit questions
g2 <- read.csv(
  here(
    "data", "input", "qualtrics_summer_2024",
    "CBAM Summer 2024_January 21, 2025_08.46.csv"
  )
)
g2 <- g2[-seq_len(2), ]
g2[["sample"]] <- "Summer2024CBAM"

# Spring 2024 Roosevelt survey: its credit questions belong to a survey
# experiment, so they should not be used
g3 <- read.csv(
  here(
    "data", "input", "qualtrics_spring_2024_roosevelt",
    "Roosevelt Spring 2024 Survey_January 21, 2025_08.49.csv"
  )
)
g3 <- g3[-seq_len(2), ]
g3[["sample"]] <- "Spring2024Roosevelt"

# Stack the three waves; columns absent from a wave are filled with NA
g <- bind_rows(list(g1, g2, g3))

# Apply covariate cleaning script that's universal for all projects.
# NOTE(review): the script is sourced for its side effects; presumably it reads
# and modifies the global `g` built above (and creates some of the columns
# selected below, e.g. black through income_quota) -- confirm in the script.
source(here("R", "surveys", "clean_qualtrics_covariates.R"))

# Select relevant covariates.
# The `a:b` entries are positional ranges, so which columns they pick up
# depends on the column order of `g` at this point. Column names here are still
# the raw export names (e.g. `greenproj_yes_Click.Count`).
g <- subset(
  g,
  select = c(StartDate:consent, greenproj:greenproj_yes_Click.Count,
             ClimChange, HumanCause, GlobalWarm, employ, hispanic,
             ffemploy, term, gc, qualtricsZip, Zipcode, County, State, qualtricsCountry, qualtricsState, qualtricsCity,
             credit_1:credit_oe_Click.Count, black:income_quota, sample, greenproj_type_1:greenproj_type_6)
)
# Normalise column names (janitor::clean_names) before referring to them below
g <- clean_names(g)

# Rename the Qualtrics metadata column `term` to avoid conflicts with
# marginaleffects, and give the geography columns explicit suffixes
# (`_survey` vs `_ip`; presumably self-reported vs Qualtrics-derived values --
# confirm against the questionnaire).
g <- rename(
  g,
  term_qc = term,  # Qualtrics quality control field
  zip_survey = zipcode,
  zip_ip = qualtrics_zip,
  county_survey = county,
  state_survey = state,
  country_ip = qualtrics_country,
  state_ip = qualtrics_state,
  city_ip = qualtrics_city
)

# Location columns are mandatory on this (full-mode) path: stop with a
# diagnostic listing any location-like column names if either one is missing.
if (!all(c("location_longitude", "location_latitude") %in% names(g))) {
  stop("Full mode processing requires location columns in raw Qualtrics data.\n",
       "Expected columns: location_longitude, location_latitude\n",
       "Available columns: ", paste(grep("location|lat|lon", names(g), ignore.case = TRUE, value = TRUE), collapse = ", "),
       "\n\nIf running in anonymized mode, ensure the cache file exists: ", cache_file)
}

# Rename the coordinates and convert them to numeric
g <- rename(g, lon_ip = location_longitude, lat_ip = location_latitude)
g <- mutate(
  g,
  lon_ip = as.numeric(lon_ip),
  lat_ip = as.numeric(lat_ip)
)
message("[INFO] Location columns found and renamed (full mode)")

# Process survey answers ----
g$start_date <- as.Date(g$start_date)

# Binary climate-attitude indicators: 1 for the listed response, 0 for any
# other non-missing value, NA where the answer is NA.
# NOTE(review): a blank string ("") also compares unequal and is therefore
# coded 0 rather than NA -- confirm this is intended for skipped items.
g$clim_change_bin <- ifelse(g$clim_change == "Climate change is happening", 1, 0)
g$human_cause_bin <- ifelse(g$human_cause == "Humans are causing climate change", 1, 0)
g$global_warm_bin <- ifelse(g$global_warm == "Immediate and drastic action is necessary.", 1, 0)

# Combine the three indicators into a single index with idx_invcov()
# (presumably the inverse-covariance-weighted index from stdidx -- confirm).
g$gw_idx <- idx_invcov(g$global_warm_bin, g$clim_change_bin, g$human_cause_bin)

# Green project awareness: factor plus a 1/0 indicator for "Yes".
# Blank greenproj_no / greenproj_yes answers are NOT recoded here: the green
# project benefits step further down recodes them to NA right before it
# coalesces the two columns, so doing it here as well was redundant.
g$greenproj <- factor(g$greenproj)
g$greenproj_bin <- ifelse(g$greenproj == "Yes", 1, 0)

# Replace the numbered green-project type items with descriptive names
g <- rename(
  g,
  proj_solar = greenproj_type_1,
  proj_wind = greenproj_type_2,
  proj_geo = greenproj_type_3,
  proj_bat = greenproj_type_4,
  proj_ev = greenproj_type_5,
  proj_h = greenproj_type_6
)

# Replace the numbered credit items with the name of the actor being rated
g <- rename(
  g,
  credit_biden = credit_1,
  credit_cong = credit_2,
  credit_gov = credit_3,
  credit_state = credit_4,
  credit_com = credit_5,
  credit_market = credit_6
)

# Convert each credit item to an ordered factor on the five-point
# responsibility scale; any value outside these levels (including blanks)
# becomes NA.
g <- mutate(
  g,
  across(
    c(credit_biden, credit_cong, credit_gov, credit_state, credit_com, credit_market),
    function(answer) {
      ordered(
        answer,
        levels = c(
          "Not at all responsible",
          "Not too responsible",
          "Moderately responsible",
          "Very responsible",
          "Extremely responsible"
        )
      )
    }
  )
)

# Add a `<item>_bin` indicator per credit item: 1 for the top two categories,
# 0 for the lower three, NA where the item is missing.
g <- mutate(
  g,
  across(
    c(credit_biden, credit_cong, credit_gov, credit_state, credit_com, credit_market),
    function(answer) {
      top_two <- as.numeric(answer %in% c("Extremely responsible", "Very responsible"))
      replace(top_two, is.na(answer), NA_real_)
    },
    .names = "{.col}_bin"
  )
)

# Finally process green project benefits---only on one survey
# Blank answers must become NA first, otherwise coalesce() would keep an empty
# string from greenproj_no instead of falling through to greenproj_yes.
g <- g %>% mutate(across(c(greenproj_no, greenproj_yes), ~ ifelse(.x == "", NA, .x)))
g$greenbenefit <- coalesce(g$greenproj_no, g$greenproj_yes)

# Collapse the differently worded answers into one five-point scale, then
# derive an ordered factor, its integer level code (1 = "Greatly harm" ...
# 5 = "Greatly benefit") and a benefit indicator.
g <- g %>%
  mutate(
    greenbenefit = case_when(
      grepl("Greatly benef", greenbenefit) ~ "Greatly benefit",
      grepl("Somewhat benef", greenbenefit) ~ "Somewhat benefit",
      grepl("Have|Had", greenbenefit) ~ "No effect",
      grepl("Somewhat harm", greenbenefit) ~ "Somewhat harm",
      grepl("Greatly harm", greenbenefit) ~ "Greatly harm",
      # TRUE rather than T: T is an ordinary variable that can be reassigned
      TRUE ~ NA_character_
    ),
    greenbenefit = factor(greenbenefit, ordered = TRUE, levels = c("Greatly harm", "Somewhat harm", "No effect", "Somewhat benefit", "Greatly benefit")),
    greenbenefit_num = as.numeric(greenbenefit),
    # 1 = any benefit, 0 = harm or no effect, NA = unanswered / not asked
    greenbenefit_bin = case_when(
      greenbenefit %in% c("Greatly benefit", "Somewhat benefit") ~ 1,
      greenbenefit %in% c("Greatly harm", "Somewhat harm", "No effect") ~ 0,
      TRUE ~ NA_real_
    )
  )

# Write the processed survey to the cache path defined at the top of the
# script (`cache_file`), so the path checked in anonymized mode and the path
# written here cannot drift apart. Create the cache directory first because
# saveRDS() does not create missing directories.
dir.create(dirname(cache_file), recursive = TRUE, showWarnings = FALSE)
saveRDS(g, cache_file)
message(glue("Number of respondents: {nrow(g)}"))
message(glue("[OK] Survey processed and saved to: {cache_file}"))

} # End of REPLICATION_MODE check
